Equal Period

Plots of Tweets and Trains for the entire period, equal length before and after the change of schedule

Imports



In [1]:

    
from pymongo import MongoClient
from datetime import datetime, date, timedelta, time
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.basemap import Basemap
import pymysql
matplotlib.style.use('ggplot')

Help functions



In [2]:

    
def toUnix(datetime):
    unix = datetime.strftime('%s')
    return unix

def convertDatetime(unix):
    dt = datetime.fromtimestamp(unix)
    return dt.strftime('%Y-%m-%d %H:%M:%S')

def convertDate(unix):
    d = date.fromtimestamp(unix)
    return d.strftime('%Y-%m-%d')

Database functions



In [3]:

    
class RailDatabase():
    def __init__(self, isNew):
        if isNew:
            self.conn = pymysql.connect(host='localhost', port=3306, user='jnevens', passwd='Panda85?', db='newrailDB')
        else:
            self.conn = pymysql.connect(host='localhost', port=3306, user='jnevens', passwd='Panda85?', db='oldrailDB')

    def getAllRoutes(self, date):
        C = self.conn.cursor()
        C.execute('SELECT route_id FROM route WHERE date = %s', (date,))
        rows = C.fetchall()
        C.close()
        return rows

    def getStops(self, routeID):
        C = self.conn.cursor()
        C.execute('SELECT * FROM stop WHERE route_id = %s', (routeID,))
        rows = C.fetchall()
        C.close()
        return rows

    def getLastStop(self, routeID):
        C = self.conn.cursor()
        C.execute('SELECT * FROM stop WHERE route_id = %s ORDER BY arrival_datetime', (routeID,))
        rows = C.fetchall()
        last = rows[-1:]
        C.close()
        return last

    def getAllStations(self):
        C = self.conn.cursor()
        C.execute('SELECT * FROM station')
        rows = C.fetchall()
        C.close()
        return rows
    
class TweetDatabase():
    def __init__(self, isNew):
        self.conn = MongoClient().data_science
        self.new = isNew
        
    def connect(self):
        if self.new:
            return self.conn.new_tweets
        else:
            return self.conn.old_tweets
    
class Config():
    def __init__(self, isNew):
        self.new = isNew
        
    ##EQUAL PERIOD BEFORE AND AFTER NEW SCHEDULE##
    def period(self):
        if self.new:
            return (date(2014, 10, 27), date(2015, 2, 1))
        else:
            return (date(2014, 10, 27), date(2014, 12, 14))

Tweets



In [4]:

    
def tweetsPerDay(isNew):
    conf = Config(isNew)
    
    start, stop = conf.period()
    period = pd.date_range(start, stop)

    #Tweets Per Day
    tpd = pd.Series(0, index = period)

    db = TweetDatabase(False).connect()
    oldTweets = db.find()
    for tweet in oldTweets:
        create_datetime = tweet['created_at']
        create_datetime = parser.parse(create_datetime, ignoretz = True)
        create_date = create_datetime.date()
        d = create_date.isoformat()
        tpd[d] += 1

    db = TweetDatabase(True).connect()
    newTweets = db.find()
    for tweet in newTweets:
        create_datetime = tweet['created_at']
        create_datetime = parser.parse(create_datetime, ignoretz = True)
        create_date = create_datetime.date()
        if create_date <= stop:
            d = create_date.isoformat()
            tpd[d] += 1

    return tpd



In [6]:

    
tpd = tweetsPerDay(True)

fig = plt.figure()
plt.bar(np.arange(tpd.size), tpd, color='b')
plt.axvline(49, color='r')
plt.title('Tweets per day')
plt.xlabel('Date')
plt.ylabel('Amount of Tweets')
plt.ylim(ymax=1800)
fig.autofmt_xdate()
plt.savefig('./plots/equal_period_tweets.pdf')
plt.show()

Trains



In [9]:

    
def delaysPerDay(isNew):
    conf = Config(isNew)
    start, stop = conf.period()
    limit = date(2014, 12, 14)
    diff = stop - start
    period = pd.date_range(start, stop)
    zeros = np.zeros((diff.days+1, 2))

    #DataFrame with worst-case and avg delay on entire network, per day
    #Delay is computed in a worst-case scenario, i.e. max delay of a train
    #divided by the amount of trains on that day.
    #In avg-case scenario, i.e. avg of avg of arrival and avg of departure delay
    #divided by the amount of trains on that day.
    delays = pd.DataFrame(zeros, index = period, columns = ['Worst case', 'Avg case'])

    while(start <= stop):
        n = start > limit
        db = RailDatabase(n)

        t = time(0, 0, 0)
        dt = datetime.combine(start, t)

        #Get all routes for a specified date.
        #Every trainID rides only once each day, so there is no point in asking all the trainIDs first
        routes = db.getAllRoutes(toUnix(dt))

        #Instead of keeping a list of worst_delays {which is O(n) in memory}, 
        #the worst_delay is accumulated and divided by the amount of data points {which is O(1) in memory}
        total_worst_delay = 0
        worst_delay_count = 0

        total_avg_delay = 0
        avg_delay_count = 0

        for routeRow in routes:
            routeID = routeRow[0]
            stops = db.getStops(routeID)

            max_delay = 0

            total_arrival_delay = 0
            arrival_delay_count = 0

            total_departure_delay = 0
            departure_delay_count = 0

            for stopRow in stops:
                arrival_detected = stopRow[3]
                departure_detected = stopRow[6]
                arrival_delay = stopRow[2]
                departure_delay = stopRow[5]

                if arrival_detected:
                    total_arrival_delay += arrival_delay
                    arrival_delay_count += 1

                if departure_detected:
                    total_departure_delay += departure_delay
                    departure_delay_count += 1

                if max(arrival_delay, departure_delay) > max_delay:
                    max_delay = max(arrival_delay, departure_delay)

            total_worst_delay += max_delay
            worst_delay_count += 1

            if arrival_delay_count == 0:
                arrival_delay_count = 1
            if departure_delay_count == 0:
                departure_delay_count = 1
            avg_arrival = float(total_arrival_delay) / float(arrival_delay_count)
            avg_departure = float(total_departure_delay) / float(departure_delay_count)

            total_avg_delay += np.mean([avg_arrival, avg_departure])
            avg_delay_count += 1

        key = start.isoformat()
        if worst_delay_count == 0:
            worst_delay_count = 1
        if avg_delay_count == 0:
            avg_delay_count = 1
        delays['Worst case'][key] = float(total_worst_delay) / float(worst_delay_count)
        delays['Avg case'][key] = float(total_avg_delay) / float(avg_delay_count)

        delta = timedelta(days=1)
        start = start + delta

    return delays



In [10]:

    
delays = delaysPerDay(True)

rolling = pd.rolling_mean(delays, 4, center=True)
ax_delays = delays.plot(x_compat=True, style='--', color=['r', 'b'])
rolling.plot(color=['r','b'], ax=ax_delays, legend=0)
plt.axvline('2014-12-14', color='g')
plt.title('Delays per day on entire network')
plt.xlabel('Date')
plt.ylabel('Minutes')
plt.savefig('./../../Paper/plots/equal_period_delays.png')



In [8]:

    
def cancelsPerDay(isNew):
    conf = Config(isNew)
    start, stop = conf.period()
    limit = date(2014, 12, 14)
    period = pd.date_range(start, stop)

    #Series with the amount of cancelled trains, per day
    #A train is considered cancelled when arrival is not detected at its final stop
    cancels = pd.Series(0, index = period)

    while(start <= stop):
        n = start > limit
        db = RailDatabase(n)

        t = time(0, 0, 0)
        dt = datetime.combine(start, t)
        key = start.isoformat()

        #Get all routes for a specified date.
        #Every trainID rides only once each day, so there is no point in asking all the trainIDs first
        routes = db.getAllRoutes(toUnix(dt))

        for routeRow in routes:
            routeID = routeRow[0]
            lastStop = db.getLastStop(routeID)

            for stopRow in lastStop:
                arrival_detected = stopRow[3]
                if not arrival_detected:
                    cancels[key] += 1

        delta = timedelta(days=1)
        start = start + delta

    return cancels



In [9]:

    
cancels = cancelsPerDay(True)

fig = plt.figure()
plt.bar(np.arange(98), cancels, color='b')
plt.axvline(49, color='r')
plt.title('Cancelled trains per day')
plt.xlabel('Date')
plt.ylabel('Cancelled trains')
fig.autofmt_xdate()
plt.savefig('./plots/equal_period_cancels.pdf')
plt.show()



In [6]:

    
def percentageDelays(isNew):
    conf = Config(isNew)
    start, stop = conf.period()
    diff = stop - start
    period = pd.date_range(start, stop)
    zeros = np.zeros((diff.days+1, 2))
    limit = date(2014, 12, 14)

    #DataFrame with the number of delayed trains per day
    #A train is considered delayed when it suffers a delay of more than 5 minutes at any stop
    #The 2nd column is the percentage of delayed trains on that day.
    perctDelays = pd.DataFrame(zeros, index = period, columns = ['Delayed', 'Percentage'])

    while(start <= stop):
        n = start > limit
        db = RailDatabase(n)
        
        t = time(0, 0, 0)
        dt = datetime.combine(start, t)

        #Get all routes for a specified date.
        #Every trainID rides only once each day, so there is no point in asking all the trainIDs first
        routes = db.getAllRoutes(toUnix(dt))

        key = start.isoformat()
        count = len(routes)

        #Get all stops for the route
        for routeRow in routes:
            routeID = routeRow[0]
            stops = db.getStops(routeID)

            #If a stop is found in the route, with > 5min delay
            #then this train is considered delayed!
            for stopRow in stops:
                arrival_detected = stopRow[3]
                departure_detected = stopRow[6]
                arrival_delay = stopRow[2]
                departure_delay = stopRow[5]

                if arrival_detected:
                    if arrival_delay > 5:
                        perctDelays['Delayed'][key] += 1
                        break
                elif departure_detected:
                    if departure_delay > 5:
                        perctDelays['Delayed'][key] += 1
                        break

        perctDelays['Percentage'][key] = (float(perctDelays['Delayed'][key]) / float(count)) * float(100)

        delta = timedelta(days=1)
        start = start + delta

    return perctDelays



In [7]:

    
perctDelays = percentageDelays(True)

fig = plt.figure()
plt.bar(perctDelays.index, perctDelays['Percentage'], color='b')
plt.axvline('2014-12-15', color='r')
plt.title('Percentage of delayed trains per day')
plt.xlabel('Date')
plt.ylabel('Percentage')
plt.ylim(ymax=60)
fig.autofmt_xdate()
plt.savefig('./plots/equal_period_percentage_delays.pdf')
plt.show()



In [ ]: